library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(palmerpenguins)
## Scatterplot of penguin bill depth against bill length, with both axes
## fixed to the range 0-60 so that zero is shown.
ggplot(
  data = penguins,
  mapping = aes(x = bill_length_mm, y = bill_depth_mm)
) +
  geom_point() +
  xlim(0, 60) +
  ylim(0, 60)
## Warning: Removed 2 rows containing missing values (geom_point).
You should include zero on the axis if you have bars, because people read a bar by its length and cannot interpret that length well without a zero baseline. On a line or dot plot you can use whatever baseline the default gives you, because people read positions there and do better at following what those actually mean.
library(tidyverse)
library(billboard)
library(ggplot2)
head(wiki_hot_100s)
## no title artist year
## 1 1 Theme from A Summer Place Percy Faith 1960
## 2 2 He'll Have to Go Jim Reeves 1960
## 3 3 Cathy's Clown The Everly Brothers 1960
## 4 4 Running Bear Johnny Preston 1960
## 5 5 Teen Angel Mark Dinning 1960
## 6 6 I'm Sorry Brenda Lee 1960
tail(wiki_hot_100s)
## no title artist year
## 5696 95 Adventure of a Lifetime Coldplay 2016
## 5697 96 Humble and Kind Tim McGraw 2016
## 5698 97 Wicked Future 2016
## 5699 98 Tiimmy Turner Desiigner 2016
## 5700 99 See You Again Wiz Khalifa featuring Charlie Puth 2016
## 5701 100 Perfect One Direction 2016
Exercise 1:
## Exercise 1: the 20 artists with the most Hot 100 songs in 2000-2009.

## Keep only songs from the 2000s. as.numeric() forces a numeric
## comparison, so the filter does not depend on string ordering if `year`
## is stored as text. The `year` column itself is left unchanged.
df_2000s <- wiki_hot_100s %>%
  as_tibble() %>%
  filter(between(as.numeric(year), 2000, 2009))

## One row per artist with the number of songs, largest first, top 20 only.
## fct_reorder() orders the artist factor by nsongs so the bars are sorted.
df_2000s_sum <- df_2000s %>%
  count(artist, name = "nsongs") %>%
  arrange(desc(nsongs)) %>%
  slice_head(n = 20) %>%
  mutate(artist = fct_reorder(artist, nsongs))

## Horizontal bar chart; coord_flip() puts the artist names on the y axis.
ggplot(df_2000s_sum, aes(x = artist, y = nsongs)) +
  geom_col() +
  coord_flip() +
  labs(x = "Artist", y = "Number of songs in top 100")
Exercise 2: The artist for “See You Again” is listed as Wiz Khalifa featuring Charlie Puth. Because of this, the song is not counted under Wiz Khalifa in our plot, even though it should really count as one of his songs.
Exercise 3:
library(stringr)
## Remove " featuring ..." and everything after it from the artist name,
## then show the last rows to check the result.
wiki_hot_100s %>%
  mutate(artist = str_remove(artist, " featuring .*")) %>%
  tail()
## no title artist year
## 5696 95 Adventure of a Lifetime Coldplay 2016
## 5697 96 Humble and Kind Tim McGraw 2016
## 5698 97 Wicked Future 2016
## 5699 98 Tiimmy Turner Desiigner 2016
## 5700 99 See You Again Wiz Khalifa 2016
## 5701 100 Perfect One Direction 2016
Exercise 4: Lollipop chart!
## Lollipop chart: a point at each artist's song count plus a segment
## running from zero up to that count.
ggplot(data = df_2000s_sum, mapping = aes(x = artist, y = nsongs)) +
  geom_point() +
  ## x is inherited from the plot-level mapping
  geom_segment(mapping = aes(xend = artist, y = 0, yend = nsongs)) +
  coord_flip() +
  labs(x = "Artist", y = "Number of songs in top 100")
Exercise 5:
## Same lollipop chart with custom styling: large blue star-shaped points
## and goldenrod segments.
ggplot(data = df_2000s_sum, mapping = aes(x = artist, y = nsongs)) +
  geom_point(shape = 8, size = 4, color = "blue") +
  geom_segment(
    mapping = aes(xend = artist, y = 0, yend = nsongs),
    color = "goldenrod1"
  ) +
  coord_flip() +
  labs(x = "Artist", y = "Number of songs in top 100")
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(tidyverse)
library(httr)
## Download the Wikipedia article for the 2017 year-end Hot 100, parse the
## HTML, and keep the first table on the page with a `year` column added.
year <- 2017
webpage <- paste0(
  "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_",
  year
)
## The request is made with TLS peer verification switched off.
content <- read_html(
  httr::GET(webpage, config = httr::config(ssl_verifypeer = FALSE))
)
tab <- html_nodes(content, "table")
df <- mutate(html_table(tab[[1]]), year = 2017)
df
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Shape of You\"" Ed Sheeran 2017
## 2 2 "\"Despacito (Remix)\"" Luis Fonsi and Daddy Yankee featu… 2017
## 3 3 "\"That's What I Like\"" Bruno Mars 2017
## 4 4 "\"Humble\"" Kendrick Lamar 2017
## 5 5 "\"Something Just Like This\"" The Chainsmokers and Coldplay 2017
## 6 6 "\"Bad and Boujee\"" Migos featuring Lil Uzi Vert 2017
## 7 7 "\"Closer\"" The Chainsmokers featuring Halsey 2017
## 8 8 "\"Body Like a Back Road\"" Sam Hunt 2017
## 9 9 "\"Believer\"" Imagine Dragons 2017
## 10 10 "\"Congratulations\"" Post Malone featuring Quavo 2017
## # … with 90 more rows
## Scrape the Billboard Year-End Hot 100 table for one year from Wikipedia.
##
## @param year Year to fetch; it is pasted onto the end of the article URL.
## @return The first table on that page, as returned by html_table(), with
##   a `year` column holding the requested year.
get_wiki_100 <- function(year) {
  url <- paste0(
    "https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_",
    year
  )
  ## Request the `url` built above. The earlier version piped the global
  ## `webpage` (fixed to 2017) instead, so every call returned the 2017
  ## chart relabelled with the requested year.
  ## NOTE(review): ssl_verifypeer = FALSE turns off TLS certificate
  ## verification; kept as in the original -- confirm it is still required.
  content <- url %>%
    httr::GET(config = httr::config(ssl_verifypeer = FALSE)) %>%
    read_html()
  tab <- content %>% html_nodes("table")
  ## NOTE(review): assumes the chart is the first <table> on the page.
  ## `!!year` injects the function argument, so a `year` column already
  ## present in the scraped table could not shadow it.
  tab[[1]] %>%
    html_table() %>%
    mutate(year = !!year)
}
library(purrr)
## Years to scrape, stored as a list with one (double) year per element.
year_list <- as.list(c(2017, 2018, 2019, 2020, 2021))
year_list
## [[1]]
## [1] 2017
##
## [[2]]
## [1] 2018
##
## [[3]]
## [1] 2019
##
## [[4]]
## [1] 2020
##
## [[5]]
## [1] 2021
## Call get_wiki_100() once per year; the result is a list with one table
## per element of year_list.
df_all <- year_list %>% map(get_wiki_100)
df_all
## [[1]]
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Shape of You\"" Ed Sheeran 2017
## 2 2 "\"Despacito (Remix)\"" Luis Fonsi and Daddy Yankee featu… 2017
## 3 3 "\"That's What I Like\"" Bruno Mars 2017
## 4 4 "\"Humble\"" Kendrick Lamar 2017
## 5 5 "\"Something Just Like This\"" The Chainsmokers and Coldplay 2017
## 6 6 "\"Bad and Boujee\"" Migos featuring Lil Uzi Vert 2017
## 7 7 "\"Closer\"" The Chainsmokers featuring Halsey 2017
## 8 8 "\"Body Like a Back Road\"" Sam Hunt 2017
## 9 9 "\"Believer\"" Imagine Dragons 2017
## 10 10 "\"Congratulations\"" Post Malone featuring Quavo 2017
## # … with 90 more rows
##
## [[2]]
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Shape of You\"" Ed Sheeran 2018
## 2 2 "\"Despacito (Remix)\"" Luis Fonsi and Daddy Yankee featu… 2018
## 3 3 "\"That's What I Like\"" Bruno Mars 2018
## 4 4 "\"Humble\"" Kendrick Lamar 2018
## 5 5 "\"Something Just Like This\"" The Chainsmokers and Coldplay 2018
## 6 6 "\"Bad and Boujee\"" Migos featuring Lil Uzi Vert 2018
## 7 7 "\"Closer\"" The Chainsmokers featuring Halsey 2018
## 8 8 "\"Body Like a Back Road\"" Sam Hunt 2018
## 9 9 "\"Believer\"" Imagine Dragons 2018
## 10 10 "\"Congratulations\"" Post Malone featuring Quavo 2018
## # … with 90 more rows
##
## [[3]]
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Shape of You\"" Ed Sheeran 2019
## 2 2 "\"Despacito (Remix)\"" Luis Fonsi and Daddy Yankee featu… 2019
## 3 3 "\"That's What I Like\"" Bruno Mars 2019
## 4 4 "\"Humble\"" Kendrick Lamar 2019
## 5 5 "\"Something Just Like This\"" The Chainsmokers and Coldplay 2019
## 6 6 "\"Bad and Boujee\"" Migos featuring Lil Uzi Vert 2019
## 7 7 "\"Closer\"" The Chainsmokers featuring Halsey 2019
## 8 8 "\"Body Like a Back Road\"" Sam Hunt 2019
## 9 9 "\"Believer\"" Imagine Dragons 2019
## 10 10 "\"Congratulations\"" Post Malone featuring Quavo 2019
## # … with 90 more rows
##
## [[4]]
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Shape of You\"" Ed Sheeran 2020
## 2 2 "\"Despacito (Remix)\"" Luis Fonsi and Daddy Yankee featu… 2020
## 3 3 "\"That's What I Like\"" Bruno Mars 2020
## 4 4 "\"Humble\"" Kendrick Lamar 2020
## 5 5 "\"Something Just Like This\"" The Chainsmokers and Coldplay 2020
## 6 6 "\"Bad and Boujee\"" Migos featuring Lil Uzi Vert 2020
## 7 7 "\"Closer\"" The Chainsmokers featuring Halsey 2020
## 8 8 "\"Body Like a Back Road\"" Sam Hunt 2020
## 9 9 "\"Believer\"" Imagine Dragons 2020
## 10 10 "\"Congratulations\"" Post Malone featuring Quavo 2020
## # … with 90 more rows
##
## [[5]]
## # A tibble: 100 × 4
## No. Title `Artist(s)` year
## <int> <chr> <chr> <dbl>
## 1 1 "\"Shape of You\"" Ed Sheeran 2021
## 2 2 "\"Despacito (Remix)\"" Luis Fonsi and Daddy Yankee featu… 2021
## 3 3 "\"That's What I Like\"" Bruno Mars 2021
## 4 4 "\"Humble\"" Kendrick Lamar 2021
## 5 5 "\"Something Just Like This\"" The Chainsmokers and Coldplay 2021
## 6 6 "\"Bad and Boujee\"" Migos featuring Lil Uzi Vert 2021
## 7 7 "\"Closer\"" The Chainsmokers featuring Halsey 2021
## 8 8 "\"Body Like a Back Road\"" Sam Hunt 2021
## 9 9 "\"Believer\"" Imagine Dragons 2021
## 10 10 "\"Congratulations\"" Post Malone featuring Quavo 2021
## # … with 90 more rows
## Stack the scraped yearly tables into one data frame, strip the literal
## double-quote characters from the titles, and rename the columns to the
## names used in the billboard data.
df_2017_present <- df_all %>%
  bind_rows() %>%
  mutate(Title = str_remove_all(Title, pattern = "\"")) %>%
  rename(no = No., title = Title, artist = `Artist(s)`)

## Billboard package data as a tibble, with `year` converted to numeric
## and `no` converted to integer.
wiki_tibble <- wiki_hot_100s %>%
  as_tibble() %>%
  mutate(year = as.numeric(year), no = as.integer(no))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
## Combine the package data with the scraped data, then drop
## " featuring ..." from the artist names.
hot100_df <- wiki_tibble %>%
  bind_rows(df_2017_present) %>%
  mutate(artist = str_remove(artist, pattern = " featuring .*"))
Exercise 6:
## Exercise 6: the 20 artists with the most Hot 100 songs in 2010-2019,
## drawn as a styled lollipop chart.
df_2010s <- filter(hot100_df, year >= 2010 & year <= 2019)

## Songs per artist, largest first, top 20, with the artist factor ordered
## by song count so the chart is sorted.
df_2010s_sum <- df_2010s %>%
  count(artist, name = "nsongs") %>%
  arrange(desc(nsongs)) %>%
  slice(1:20) %>%
  mutate(artist = fct_reorder(artist, nsongs))

ggplot(data = df_2010s_sum, mapping = aes(x = artist, y = nsongs)) +
  geom_point(shape = 8, size = 4, color = "blue") +
  geom_segment(
    mapping = aes(xend = artist, y = 0, yend = nsongs),
    color = "goldenrod1"
  ) +
  coord_flip() +
  labs(x = "Artist", y = "Number of songs in top 100")
Exercise 7:
We are pulling data from a website and then parsing it as HTML into a format that R can understand. Then we put it into a table with all of the years labeled. With the purrr package we repeat this for every year in the list, and then we remove the quotes from the titles and rename the columns in lowercase so that it all looks the same as the older data and can be combined, sorted, and trimmed.
library(tidyverse)
## Read the Happy Planet Index data from the project's data folder.
hpi_df <- read_csv(file = "data/hpi-tidy.csv")
## Rows: 151 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country, GovernanceRank, Region
## dbl (8): HPIRank, LifeExpectancy, Wellbeing, HappyLifeYears, Footprint, Happ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hpi_df
## # A tibble: 151 × 11
## HPIRank Country LifeExpectancy Wellbeing HappyLifeYears Footprint
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 109 Afghanistan 48.7 4.76 29.0 0.540
## 2 18 Albania 76.9 5.27 48.8 1.81
## 3 26 Algeria 73.1 5.24 46.2 1.65
## 4 127 Angola 51.1 4.21 28.2 0.891
## 5 17 Argentina 75.9 6.44 55.0 2.71
## 6 53 Armenia 74.2 4.37 41.9 1.73
## 7 76 Australia 81.9 7.41 65.5 6.68
## 8 48 Austria 80.9 7.35 64.3 5.29
## 9 80 Azerbaijan 70.7 4.22 39.1 1.97
## 10 146 Bahrain 75.1 4.55 43.5 6.65
## # … with 141 more rows, and 5 more variables: HappyPlanetIndex <dbl>,
## # Population <dbl>, GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>
making scatterplot
## Wellbeing against ecological footprint, one point per row of hpi_df.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point()
Labeling points: make a new data set containing just the point we want to label, then tell ggplot to use only that data set for the label layer.
## Subset holding only the United States row, used for labelling.
hpi_us <- filter(hpi_df, Country == "United States of America")
hpi_us
## # A tibble: 1 × 11
## HPIRank Country LifeExpectancy Wellbeing HappyLifeYears Footprint
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 105 United States of Am… 78.5 7.16 61.3 7.19
## # … with 5 more variables: HappyPlanetIndex <dbl>, Population <dbl>,
## # GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>
## Full scatterplot, with a text label drawn only for the rows in hpi_us.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label(mapping = aes(label = Country), data = hpi_us)
editing point
library(ggrepel)
## Same plot using ggrepel for the label, plus a larger open circle drawn
## around the labelled point.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(mapping = aes(label = Country), data = hpi_us) +
  geom_point(data = hpi_us, shape = 1, size = 3)
Exercise 1:
labeling 3 different countries
## Subset holding the Australia, Russia, and Brazil rows, used for labelling.
hpi_ARB <- hpi_df %>%
  filter(Country %in% c("Australia", "Russia", "Brazil"))
hpi_ARB
## # A tibble: 3 × 11
## HPIRank Country LifeExpectancy Wellbeing HappyLifeYears Footprint
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 76 Australia 81.9 7.41 65.5 6.68
## 2 21 Brazil 73.5 6.84 55.5 2.93
## 3 122 Russia 68.8 5.46 44.7 4.40
## # … with 5 more variables: HappyPlanetIndex <dbl>, Population <dbl>,
## # GDPcapita <dbl>, GovernanceRank <chr>, Region <chr>
## Scatterplot of every row, with repelled labels and open circles for the
## three rows in hpi_ARB.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(mapping = aes(label = Country), data = hpi_ARB) +
  geom_point(data = hpi_ARB, shape = 1, size = 3)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:httr':
##
## config
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Build the static scatterplot, then hand it to ggplotly() to get an
## interactive version.
plot1 <- hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point()
ggplotly(p = plot1)
To get country names on the plot, we add a label aesthetic to the plot and then set tooltip = "label" in ggplotly().
## Map Country to the `label` aesthetic and pass tooltip = "label" to
## ggplotly().
plot1 <- hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing, label = Country)) +
  geom_point()
ggplotly(p = plot1, tooltip = "label")
This is the uniform way to use plotly() so we really like plotly.
Exercise 2:
## Interactive version of the 2010s lollipop chart; the artist is mapped
## to `label` and passed to ggplotly() as the tooltip.
plot2 <- ggplot(
  data = df_2010s_sum,
  mapping = aes(x = artist, y = nsongs, label = artist)
) +
  geom_point(shape = 8, size = 4, color = "blue") +
  geom_segment(
    mapping = aes(xend = artist, y = 0, yend = nsongs),
    color = "goldenrod1"
  ) +
  coord_flip() +
  labs(x = "Artist", y = "Number of songs in top 100")
ggplotly(p = plot2, tooltip = "label")
Exercise 3:
Some advantages are that in things like scatterplots you don’t need all of the labels all of the time to be able to understand what the plot is showing you. It can be a lot more pleasant to look at while still providing all of the information. It is also really cool if you are showing the plot to an audience who can use the interactive features.
Some disadvantages are that in something like a bar chart it is harder to tell how things compare because you cannot see all of the labels at once. Additionally, having interactive labeling takes away the option of a concrete way of sharing the plot. You cannot print an interactive plot and have it work.
## Labelled US scatterplot with a title, subtitle, caption, and custom
## axis labels.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = Wellbeing)) +
  geom_point() +
  geom_label_repel(mapping = aes(label = Country), data = hpi_us) +
  geom_point(data = hpi_us, shape = 1, size = 3) +
  labs(
    ## axis labels
    x = "Ecological Footprint",
    y = "Wellbeing",
    ## main title, with the subtitle shown beneath it
    title = "Countries with a Higher Ecological Footprint Tend to Have Citizens with Higher Wellbeing",
    subtitle = "Wellbeing is on a 1-10 scale",
    ## caption shown at the bottom of the figure
    caption = "Data Source: http://happyplanetindex.org/countries"
  )
Exercise 4:
## Happy life years against footprint, coloured by Region with the
## ColorBrewer "Accent" palette.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = HappyLifeYears, color = Region)) +
  geom_point() +
  scale_color_brewer(palette = "Accent")
- We are using a qual scale here, so from the middle section of the page of scales. Unordered, random things.
Exercise 5:
## Same plot using the ColorBrewer "Set1" palette for comparison.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = HappyLifeYears, color = Region)) +
  geom_point() +
  scale_color_brewer(palette = "Set1")
- I like my scale better, the yellow is a little hard to see but not too bad. Overall, it is more vivid colors and easier to see, in my opinion.
## Same plot using the discrete viridis scale with the "plasma" option.
hpi_df %>%
  ggplot(mapping = aes(x = Footprint, y = HappyLifeYears, color = Region)) +
  geom_point() +
  scale_color_viridis_d(option = "plasma")
Exercise 6: scale_color_viridis_d is for discrete (categorical) variables, like Region in the plots above. scale_color_viridis_c is for continuous variables, like the fill in a geom_tile plot. scale_color_viridis_b is for continuous variables that are binned into groups before the colours are mapped.